RNN using LSTM: character-level gender classification of blog posts

<img src="img/RNN-rolled.png"/ width="80px" height="80px">

<img src="img/RNN-unrolled.png"/ width="400px" height="400px">

<img src="img/LSTM3-chain.png"/ width="800px" height="800px">


In [ ]:
from keras.optimizers import SGD
from keras.preprocessing.text import one_hot,text_to_word_sequence,base_filter
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

In [ ]:
from sklearn.cross_validation import train_test_split

In [ ]:
import os
import pickle
import numpy as np
import re

In [ ]:
import pandas as pd

In [ ]:
DATA_DIRECTORY = os.path.join('..', 'data')
print(DATA_DIRECTORY)

In [ ]:
with open(os.path.join(DATA_DIRECTORY,"male_blog_list.txt"),"rb") as male_file:
    male_posts= pickle.load(male_file)
with open(os.path.join(DATA_DIRECTORY,"female_blog_list.txt"),"rb") as female_file:
    female_posts = pickle.load(female_file)

In [ ]:
# Drop empty posts and strip newline characters.
filtered_male_posts = []
filtered_female_posts = []

for post_male in male_posts:
    if len(post_male) == 0:
        continue
    post_male = re.sub('\\n', '', post_male)
    filtered_male_posts.append(post_male)

for post_female in female_posts:
    if len(post_female) == 0:
        continue
    post_female = re.sub('\\n', '', post_female)
    filtered_female_posts.append(post_female)
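
As a quick check (illustrative, not essential), the number of empty posts dropped from each list:

In [ ]:
# How many empty posts were removed from each list.
print(len(male_posts) - len(filtered_male_posts),
      len(female_posts) - len(filtered_female_posts))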

In [ ]:
all_posts = []

In [ ]:
all_posts.extend(filtered_male_posts)
all_posts.extend(filtered_female_posts)

In [ ]:
type(all_posts)

In [ ]:
all_posts[1]

In [ ]:
len(all_posts),len(filtered_male_posts),len(filtered_female_posts)

In [ ]:
# 0 for male, 1 for female
concatenate_array_rnn = np.concatenate((np.zeros(len(filtered_male_posts)),np.ones(len(filtered_female_posts))))
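
A quick look at the class balance (a small illustrative check):

In [ ]:
# Count how many posts carry each label (0 = male, 1 = female).
print(np.bincount(concatenate_array_rnn.astype(int)))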

In [ ]:
char_list = list(set(''.join(all_posts)))

In [ ]:
char_indices = dict((c, i) for i, c in enumerate(char_list))
indices_char = dict((i, c) for i, c in enumerate(char_list))
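
The two dictionaries are inverses of each other, so any string built from characters in char_list round-trips (the sample below is illustrative and assumes its characters occur in the corpus):

In [ ]:
# Encode a short sample to indices and decode it back.
sample = 'hello'
encoded = [char_indices[c] for c in sample]
decoded = ''.join(indices_char[i] for i in encoded)
print(encoded, decoded)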

In [ ]:
label_indices = {'male':0,'female':1}
indices_label = {0:'male',1:'female'}

In [ ]:
# Find the length of the longest post (and its index) to get a sense of scale.
MAX_LENGTH = 0
MAX_INDEX = 0
for i, n in enumerate(all_posts):
    if len(n) > MAX_LENGTH:
        MAX_LENGTH = len(n)
        MAX_INDEX = i

print(MAX_LENGTH, MAX_INDEX)

In [ ]:
# Cap sequences at 5000 characters; pad_sequences will truncate longer posts.
MAX_LENGTH = 5000

In [ ]:
def blog_to_char_seq(blog):
    # Map each character to its index, then pad/truncate to MAX_LENGTH.
    # Note: pad_sequences pads with 0, which is also a valid character index
    # here, so padded positions and that character share the same id.
    blog_chars = list(blog)
    blog_chars_indices = list(map(lambda char: char_indices[char], blog_chars))
    return sequence.pad_sequences([blog_chars_indices], maxlen=MAX_LENGTH)[0]
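
For intuition, a short post maps to a fixed-length vector that is zero-padded at the front (this is just an illustrative check on a slice of the first post):

In [ ]:
# Encode the first 50 characters of the first post; zeros pad the front,
# character indices sit at the end of the vector.
sample_seq = blog_to_char_seq(all_posts[0][:50])
print(sample_seq.shape)
print(sample_seq[-60:])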

In [ ]:
X = []
y = []

for n, l in zip(all_posts, concatenate_array_rnn):
    X.append(blog_to_char_seq(n))
    y.append(l)

# uint8 keeps the matrix small; this assumes fewer than 256 distinct characters.
X = np.array(X).astype(np.uint8)
y = np.array(y)

print(X.shape, y.shape)

In [ ]:
y

In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [ ]:
len(char_list)

In [ ]:
model = Sequential()
# Character embedding: one 32-dimensional vector per character; padded
# positions (index 0) are masked out of the recurrent computation.
model.add(Embedding(len(char_list), 32, input_length=MAX_LENGTH, mask_zero=True))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.1))
# Single sigmoid unit for the binary male/female prediction.
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [ ]:
model.compile(loss='binary_crossentropy',optimizer='adagrad', metrics=["accuracy"])
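
A quick look at the layer shapes and parameter counts before training:

In [ ]:
model.summary()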

In [ ]:
model.fit(X_train,y_train,
          batch_size=32,nb_epoch=2,
          validation_split=0.1,
          verbose=1)

In [ ]:
model.evaluate(X_test,y_test,batch_size=32)

In [ ]:
predicted_output = model.predict(X_test,batch_size=32)
predicted_classes = model.predict_classes(X_test, batch_size=32)

In [ ]:
df = pd.DataFrame(columns=['predicted','actual'])

In [ ]:
df['predicted_class'] = predicted_classes.flatten()
df['predicted'] = predicted_output.flatten()

In [ ]:
df['actual'] = y_test

In [ ]:
df.predicted_class.value_counts()

In [ ]:
df.actual.value_counts()
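
For a fuller picture than separate value counts, the predictions and labels can be cross-tabulated (a small illustrative addition using the DataFrame above):

In [ ]:
# Rows are actual labels, columns are predicted classes.
pd.crosstab(df.actual, df.predicted_class)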
